!pip install imblearn --user admin
Requirement already satisfied: imblearn in c:\users\sanke\appdata\roaming\python\python38\site-packages (0.0) Requirement already satisfied: admin in c:\users\sanke\appdata\roaming\python\python38\site-packages (0.0.1) Requirement already satisfied: imbalanced-learn in c:\users\sanke\appdata\roaming\python\python38\site-packages (from imblearn) (0.8.0) Requirement already satisfied: numpy>=1.13.3 in c:\users\sanke\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.19.2) Requirement already satisfied: scipy>=0.19.1 in c:\users\sanke\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.5.2) Requirement already satisfied: scikit-learn>=0.24 in c:\users\sanke\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (0.24.2) Requirement already satisfied: joblib>=0.11 in c:\users\sanke\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (0.17.0) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\sanke\anaconda3\lib\site-packages (from scikit-learn>=0.24->imbalanced-learn->imblearn) (2.1.0)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
%matplotlib inline
pd.set_option('display.max_columns',None)
sns.set(style="darkgrid", palette="pastel", color_codes=True)
sns.set_context('paper')
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.templates.default = "plotly_dark"
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE,SelectFromModel
from sklearn.model_selection import KFold,GridSearchCV,RandomizedSearchCV,train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier ,ExtraTreesClassifier, AdaBoostClassifier,GradientBoostingClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report,confusion_matrix,f1_score,roc_auc_score,roc_curve,accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import plotly.figure_factory as ff
# Point the working directory at the project root and load the UCI
# credit-card default dataset.
# BUG FIX: the path must be a raw string — "\S" and "\C" are invalid escape
# sequences that only work by CPython's leniency; use os.path.join for the
# data file instead of hand-built backslashes.
os.chdir(r"A:\Study Materials\Credit Card Default")
df = pd.read_csv(os.path.join(os.getcwd(), 'Data', 'UCI_Credit_Card.csv'))
# Shorten the target column name and renumber the oddly-named PAY_0 to PAY_1
# so the payment-status columns run consistently PAY_1..PAY_6.
df = df.rename(columns={'default.payment.next.month': 'default', 'PAY_0': 'PAY_1'})
df.head(5)
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_1 | PAY_2 | PAY_3 | PAY_4 | PAY_5 | PAY_6 | BILL_AMT1 | BILL_AMT2 | BILL_AMT3 | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | default | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 20000.0 | 2 | 2 | 1 | 24 | 2 | 2 | -1 | -1 | -2 | -2 | 3913.0 | 3102.0 | 689.0 | 0.0 | 0.0 | 0.0 | 0.0 | 689.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 |
| 1 | 2 | 120000.0 | 2 | 2 | 2 | 26 | -1 | 2 | 0 | 0 | 0 | 2 | 2682.0 | 1725.0 | 2682.0 | 3272.0 | 3455.0 | 3261.0 | 0.0 | 1000.0 | 1000.0 | 1000.0 | 0.0 | 2000.0 | 1 |
| 2 | 3 | 90000.0 | 2 | 2 | 2 | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 29239.0 | 14027.0 | 13559.0 | 14331.0 | 14948.0 | 15549.0 | 1518.0 | 1500.0 | 1000.0 | 1000.0 | 1000.0 | 5000.0 | 0 |
| 3 | 4 | 50000.0 | 2 | 2 | 1 | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 46990.0 | 48233.0 | 49291.0 | 28314.0 | 28959.0 | 29547.0 | 2000.0 | 2019.0 | 1200.0 | 1100.0 | 1069.0 | 1000.0 | 0 |
| 4 | 5 | 50000.0 | 1 | 2 | 1 | 57 | -1 | 0 | -1 | 0 | 0 | 0 | 8617.0 | 5670.0 | 35835.0 | 20940.0 | 19146.0 | 19131.0 | 2000.0 | 36681.0 | 10000.0 | 9000.0 | 689.0 | 679.0 | 0 |
From EDA we found that some data labels are undocumented. Let's clean them up and then visualize them.
# EDUCATION codes 0, 5 and 6 are undocumented; club them into the
# catch-all category 4 ("others").
# Idiom: a single .isin() membership test replaces three chained
# equality comparisons.
df.loc[df.EDUCATION.isin([0, 5, 6]), 'EDUCATION'] = 4
df.EDUCATION.value_counts()
2 14030 1 10585 3 4917 4 468 Name: EDUCATION, dtype: int64
# MARRIAGE code 0 is undocumented; fold it into category 3 ("others").
undocumented = df.MARRIAGE == 0
df.loc[undocumented, 'MARRIAGE'] = 3
df.MARRIAGE.value_counts()
2 15964 1 13659 3 377 Name: MARRIAGE, dtype: int64
# The PAY_* status columns use -2/-1/0 for "no consumption / paid duly /
# revolving credit"; club all three into 0 so the remaining positive values
# count months of payment delay.
# DRY: one loop replaces six copy-pasted stanzas.
for col in ['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']:
    df.loc[df[col].isin([-2, -1, 0]), col] = 0
# Histogram of each PAY_* column after clubbing.
# BUG FIX: the original scattered the traces across mismatched cells
# (PAY_2 was drawn under the title "PAY_4", PAY_3 under "PAY_5", ...).
# Placing traces row-major puts each histogram under its own title.
pay_cols = ['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
fig = make_subplots(rows=3, cols=2, subplot_titles=pay_cols)
for idx, col in enumerate(pay_cols):
    fig.add_trace(go.Histogram(x=df[col], name=col),
                  row=idx // 2 + 1, col=idx % 2 + 1)
fig.update_layout(bargap=0.2, height=600, width=800, title_text="Histogram Subplots of Previous Payment Status (After clubbing)")
fig.show()
import warnings
# NOTE(review): this blanket filter also hides LogisticRegression
# convergence warnings — consider narrowing it.
warnings.filterwarnings("ignore")
# Baseline: plain logistic regression on the raw (unscaled) features.
X = df.drop(['default', 'ID'], axis=1)
y = df['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
LR = LogisticRegression(random_state=0)
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
# FIX: accuracy_score expects (y_true, y_pred); accuracy is symmetric so the
# value is unchanged, but the conventional order keeps it consistent with
# classification_report below.
print('Accuracy:', accuracy_score(y_test, y_pred))
cv_scores = cross_val_score(LR, X, y, cv=5)
print()
print(classification_report(y_test, y_pred))
print()
print("Average 5-Fold CV Score: {}".format(round(np.mean(cv_scores),4)),", Standard deviation: {}".format(round(np.std(cv_scores),4)))
Accuracy: 0.7786666666666666
precision recall f1-score support
0 0.78 1.00 0.88 7009
1 0.40 0.00 0.00 1991
accuracy 0.78 9000
macro avg 0.59 0.50 0.44 9000
weighted avg 0.70 0.78 0.68 9000
Average 5-Fold CV Score: 0.7788 , Standard deviation: 0.0001
Basically, this is our baseline score, and we have to find a better-scoring model algorithm for this problem. Baseline Accuracy: 0.77
EDA shows that the SEX, MARRIAGE and EDUCATION columns contain categorical data; hence, let's convert them to objects.
# SEX, MARRIAGE and EDUCATION are nominal categories: cast them to object
# dtype so get_dummies one-hot encodes them alongside any other object columns.
cat_cols = ['SEX', 'MARRIAGE', 'EDUCATION']
df[cat_cols] = df[cat_cols].astype('object')
# One-hot encoding
df = pd.get_dummies(df)
df.head()
| ID | LIMIT_BAL | AGE | PAY_1 | PAY_2 | PAY_3 | PAY_4 | PAY_5 | PAY_6 | BILL_AMT1 | BILL_AMT2 | BILL_AMT3 | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | default | SEX_1 | SEX_2 | EDUCATION_1 | EDUCATION_2 | EDUCATION_3 | EDUCATION_4 | MARRIAGE_1 | MARRIAGE_2 | MARRIAGE_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 20000.0 | 24 | 2 | 2 | 0 | 0 | 0 | 0 | 3913.0 | 3102.0 | 689.0 | 0.0 | 0.0 | 0.0 | 0.0 | 689.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 1 | 2 | 120000.0 | 26 | 0 | 2 | 0 | 0 | 0 | 2 | 2682.0 | 1725.0 | 2682.0 | 3272.0 | 3455.0 | 3261.0 | 0.0 | 1000.0 | 1000.0 | 1000.0 | 0.0 | 2000.0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 3 | 90000.0 | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 29239.0 | 14027.0 | 13559.0 | 14331.0 | 14948.0 | 15549.0 | 1518.0 | 1500.0 | 1000.0 | 1000.0 | 1000.0 | 5000.0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 3 | 4 | 50000.0 | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 46990.0 | 48233.0 | 49291.0 | 28314.0 | 28959.0 | 29547.0 | 2000.0 | 2019.0 | 1200.0 | 1100.0 | 1069.0 | 1000.0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 4 | 5 | 50000.0 | 57 | 0 | 0 | 0 | 0 | 0 | 0 | 8617.0 | 5670.0 | 35835.0 | 20940.0 | 19146.0 | 19131.0 | 2000.0 | 36681.0 | 10000.0 | 9000.0 | 689.0 | 679.0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
Data Splitting
# Re-split after one-hot encoding; drop the target and the row identifier.
y = df['default']
X = df.drop(['default', 'ID'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
# Re-assemble the training frame so the resampling steps below can filter by class.
df_train = X_train.join(y_train)
Also, EDA shows us that the data was unbalanced: Lets see the distribution of samples in train dataset created above
# Split the training frame by class to inspect the imbalance.
df_majority = df_train[df_train.default == 0]
df_minority = df_train[df_train.default == 1]
for part in (df_majority, df_minority):
    print("-----------")
    print(part.default.count())
print("-----------")
print(df_train.default.value_counts())
----------- 16355 ----------- 4645 ----------- 0 16355 1 4645 Name: default, dtype: int64
Now let's sample them accordingly
A. RANDOM OVERSAMPLING:
This is basically sampling random cases from minority target data and adding to dataset
# Random oversampling: draw minority rows with replacement until the two
# classes are the same size.
# Generalized: use the majority-class count instead of the hard-coded 16355,
# so the cell stays correct if the split or data changes.
df_minority_upsampled = resample(df_minority, replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)  # reproducible results
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled.default.value_counts()
1 16355 0 16355 Name: default, dtype: int64
B. RANDOM UNDERSAMPLING
This is basically removing random cases from majority target data till we achieve desired level of balance
# Random undersampling: draw majority rows without replacement down to the
# minority-class size.
# Generalized: use the minority-class count instead of the hard-coded 4645.
df_majority_downsampled = resample(df_majority, replace=False,
                                   n_samples=len(df_minority),
                                   random_state=587)
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled.default.value_counts()
1 4645 0 4645 Name: default, dtype: int64
C. SMOTE: Synthetic Minority Oversampling Technique
Why SMOTE: oversampling increases the likelihood of overfitting the model, while undersampling decreases the number of records and hence may hurt accuracy by discarding potentially useful data.
How does SMOTE work? It draws a line between existing minority-class examples and creates new synthetic samples along that line, so the new points have similar behavior to their neighbors.
# SMOTE: synthesize new minority-class samples instead of duplicating rows.
smote = SMOTE(random_state=42)
X_SMOTE, y_SMOTE = smote.fit_resample(X_train, y_train)
print(len(y_SMOTE))   # total rows after balancing
print(y_SMOTE.sum())  # minority count == majority count
32710 16355
So as of now we have 4 Datasets of differently balanced levels:
Evaluation Criteria: K Folds accross AUC - ROC
def model_eval(algo, Xtrain, ytrain, Xtest, ytest):
    """Fit `algo` on (Xtrain, ytrain) and report its quality.

    Prints train/test accuracy and ROC-AUC, a test classification report,
    and a 5-fold CV AUC estimate on the training data, then plots the test
    confusion matrix and ROC curve side by side.

    Note: `algo` is refit inside the CV loop, so on return it holds the
    fit from the last fold, not the full-training-set fit.
    """
    # (The metrics used here are already imported at the top of the file;
    # the redundant function-local import was removed.)
    algo.fit(Xtrain, ytrain)
    y_pred = algo.predict(Xtrain)
    y_train_prob = algo.predict_proba(Xtrain)[:, 1]
    print('Overall Train Accuracy', accuracy_score(ytrain, y_pred))
    print('Train AUC Score', roc_auc_score(ytrain, y_train_prob))
    y_test_pred = algo.predict(Xtest)
    y_test_prob = algo.predict_proba(Xtest)[:, 1]
    print('Overall Test Accuracy', accuracy_score(ytest, y_test_pred))
    print('Test AUC Score', roc_auc_score(ytest, y_test_prob))
    print('Classification Report of Test\n', classification_report(ytest, y_test_pred))
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    score = []
    for train_idx, test_idx in kf.split(Xtrain, ytrain):
        xtrain_k, xtest_k = Xtrain.iloc[train_idx, :], Xtrain.iloc[test_idx, :]
        ytrain_k, ytest_k = ytrain.iloc[train_idx], ytrain.iloc[test_idx]
        algo.fit(xtrain_k, ytrain_k)
        # BUG FIX: ROC-AUC must be computed from probability scores, not hard
        # class predictions — the original returned a constant 0.5 whenever
        # the fold model predicted a single class (as seen in the outputs).
        y_prob_k = algo.predict_proba(xtest_k)[:, 1]
        score.append(roc_auc_score(ytest_k, y_prob_k))
    print('K-Fold scores: %0.03f (+/- %0.5f)' % (np.mean(score), np.var(score, ddof=1)))
    f, ax = plt.subplots(1, 2, figsize=(15, 7))
    ConfMatrix = confusion_matrix(ytest, y_test_pred)
    sns.heatmap(ConfMatrix, annot=True, cmap='YlGnBu', fmt="d",
                xticklabels=['Non-default', 'Default'],
                yticklabels=['Non-default', 'Default'], linewidths=.5, ax=ax[0])
    ax[0].set_ylabel('True label')
    ax[0].set_xlabel('Predicted label')
    ax[0].set_title('Confusion Matrix')
    # Kept global for backward compatibility: later cells may read fpr/tpr.
    global fpr, tpr, thresholds
    fpr, tpr, thresholds = roc_curve(ytest, y_test_prob)
    ax[1].plot(fpr, tpr, color='r')
    ax[1].plot(fpr, fpr, color='green')  # chance diagonal as reference
    ax[1].set_ylabel('TPR')
    ax[1].set_xlabel('FPR')
    ax[1].set_title('ROC Curve')
    plt.show()
xtrain_data = [X_train, df_upsampled.drop('default', axis=1), df_downsampled.drop('default', axis=1), X_SMOTE]
ytrain_data = [y_train, df_upsampled['default'], df_downsampled['default'], y_SMOTE]
name = ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
# BUG FIX: best_log was re-created inside the loop (and declared `global` at
# module level, which is a no-op), so it only ever held the last result.
# Initialize it once, before the loop, so it collects all four.
best_log = []
for i, j, k in zip(xtrain_data, ytrain_data, name):
    print('Data is ', k)
    # Hyperparameter grid for the regularisation strength (data not scaled)
    param_grid = {'C': np.logspace(-5, 8, 15)}
    # Instantiate a logistic regression classifier
    logreg = LogisticRegression()
    # Instantiate the RandomizedSearchCV object and fit it to the data
    logreg_cv = RandomizedSearchCV(logreg, param_grid, scoring='roc_auc', cv=5, random_state=0)
    logreg_cv.fit(i, j)
    best_log.append(logreg_cv.best_params_)
    # Print the tuned parameters
    print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
    print("_" * 100)
Data is Normal Sampling
Tuned Logistic Regression Parameters: {'C': 0.05179474679231213}
____________________________________________________________________________________________________
Data is Over Sampling
Tuned Logistic Regression Parameters: {'C': 0.05179474679231213}
____________________________________________________________________________________________________
Data is Under Sampling
Tuned Logistic Regression Parameters: {'C': 19306.977288832535}
____________________________________________________________________________________________________
Data is SMOTE
Tuned Logistic Regression Parameters: {'C': 0.0007196856730011522}
____________________________________________________________________________________________________
# Best C found by the search above, one dict per sampling strategy (same order).
param_log = [{'C': 0.05179474679231213}, {'C': 0.05179474679231213},
             {'C': 19306.977288832535}, {'C': 0.0007196856730011522}]
xtrain_data = [X_train, df_upsampled.drop('default', axis=1), df_downsampled.drop('default', axis=1), X_SMOTE]
ytrain_data = [y_train, df_upsampled['default'], df_downsampled['default'], y_SMOTE]
name = ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
# Idiom: enumerate replaces the hand-built parallel index list.
for l, (i, j, k) in enumerate(zip(xtrain_data, ytrain_data, name)):
    print('Data is ', k, ' And with hyper parameter ', param_log[l])
    model_eval(LogisticRegression(**param_log[l], random_state=42), i, j, X_test, y_test)
    print("_" * 60)
Data is Normal Sampling And with hyper parameter {'C': 0.05179474679231213}
Overall Train Accuracy 0.7786190476190477
Train AUC Score 0.6571413725142401
Overall Test Accuracy 0.7787777777777778
Test AUC Score 0.6439834942789708
Classification Report of Test
precision recall f1-score support
0 0.78 1.00 0.88 7009
1 0.50 0.00 0.00 1991
accuracy 0.78 9000
macro avg 0.64 0.50 0.44 9000
weighted avg 0.72 0.78 0.68 9000
K-Fold scores: 0.500 (+/- 0.00000)
____________________________________________________________
Data is Over Sampling And with hyper parameter {'C': 0.05179474679231213}
Overall Train Accuracy 0.6114643839804341
Train AUC Score 0.6530676004475374
Overall Test Accuracy 0.554
Test AUC Score 0.6469564961287126
Classification Report of Test
precision recall f1-score support
0 0.86 0.51 0.64 7009
1 0.29 0.71 0.41 1991
accuracy 0.55 9000
macro avg 0.58 0.61 0.53 9000
weighted avg 0.74 0.55 0.59 9000
K-Fold scores: 0.614 (+/- 0.00001)
____________________________________________________________
Data is Under Sampling And with hyper parameter {'C': 19306.977288832535}
Overall Train Accuracy 0.6189451022604952
Train AUC Score 0.6627006828180816
Overall Test Accuracy 0.5517777777777778
Test AUC Score 0.6509890526774107
Classification Report of Test
precision recall f1-score support
0 0.86 0.50 0.64 7009
1 0.29 0.72 0.42 1991
accuracy 0.55 9000
macro avg 0.58 0.61 0.53 9000
weighted avg 0.74 0.55 0.59 9000
K-Fold scores: 0.618 (+/- 0.00006)
____________________________________________________________
Data is SMOTE And with hyper parameter {'C': 0.0007196856730011522}
Overall Train Accuracy 0.6048303271170896
Train AUC Score 0.6539774049877933
Overall Test Accuracy 0.6641111111111111
Test AUC Score 0.6389794523350512
Classification Report of Test
precision recall f1-score support
0 0.82 0.73 0.77 7009
1 0.32 0.45 0.37 1991
accuracy 0.66 9000
macro avg 0.57 0.59 0.57 9000
weighted avg 0.71 0.66 0.68 9000
K-Fold scores: 0.609 (+/- 0.00026)
____________________________________________________________
The best AUC score for Logistic Regression is seen with the under-sampling data, with a test AUC score of 0.65 and a K-fold score of 0.615.
xtrain_data = [X_train, df_upsampled.drop('default', axis=1), df_downsampled.drop('default', axis=1), X_SMOTE]
ytrain_data = [y_train, df_upsampled['default'], df_downsampled['default'], y_SMOTE]
name = ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
# Gaussian Naive Bayes on standardized features: the scaler is fit on each
# training set and the same fitted scaler transforms the common test set.
for i, j, k in zip(xtrain_data, ytrain_data, name):
    print('Data is ', k)
    scaler = StandardScaler()
    train_scaled = pd.DataFrame(scaler.fit_transform(i), columns=i.columns)
    test_scaled = scaler.transform(X_test)
    model_eval(GaussianNB(), train_scaled, j, test_scaled, y_test)
    print("_" * 60)
Data is Normal Sampling
Overall Train Accuracy 0.765095238095238
Train AUC Score 0.7472296421006075
Overall Test Accuracy 0.7558888888888889
Test AUC Score 0.7358834902588829
Classification Report of Test
precision recall f1-score support
0 0.86 0.82 0.84 7009
1 0.46 0.54 0.50 1991
accuracy 0.76 9000
macro avg 0.66 0.68 0.67 9000
weighted avg 0.77 0.76 0.76 9000
K-Fold scores: 0.698 (+/- 0.00003)
____________________________________________________________
Data is Over Sampling
Overall Train Accuracy 0.6991745643534087
Train AUC Score 0.7461003448684843
Overall Test Accuracy 0.7514444444444445
Test AUC Score 0.7367858244107329
Classification Report of Test
precision recall f1-score support
0 0.87 0.81 0.83 7009
1 0.45 0.56 0.50 1991
accuracy 0.75 9000
macro avg 0.66 0.68 0.67 9000
weighted avg 0.77 0.75 0.76 9000
K-Fold scores: 0.700 (+/- 0.00001)
____________________________________________________________
Data is Under Sampling
Overall Train Accuracy 0.7036598493003229
Train AUC Score 0.7496985890589207
Overall Test Accuracy 0.7393333333333333
Test AUC Score 0.7404739862696446
Classification Report of Test
precision recall f1-score support
0 0.87 0.78 0.82 7009
1 0.43 0.59 0.50 1991
accuracy 0.74 9000
macro avg 0.65 0.69 0.66 9000
weighted avg 0.77 0.74 0.75 9000
K-Fold scores: 0.703 (+/- 0.00014)
____________________________________________________________
Data is SMOTE
Overall Train Accuracy 0.6585447875267503
Train AUC Score 0.7860818523135928
Overall Test Accuracy 0.5081111111111111
Test AUC Score 0.72204392587302
Classification Report of Test
precision recall f1-score support
0 0.88 0.43 0.57 7009
1 0.28 0.80 0.42 1991
accuracy 0.51 9000
macro avg 0.58 0.61 0.50 9000
weighted avg 0.75 0.51 0.54 9000
K-Fold scores: 0.661 (+/- 0.00031)
____________________________________________________________
The best score for Naive Bayes is registered when trained with the under sampling data with an Test AUC score of 0.74 and train K-Fold score of 0.70 after it was standardized
xtrain_data = [X_train, df_upsampled.drop('default', axis=1), df_downsampled.drop('default', axis=1), X_SMOTE]
ytrain_data = [y_train, df_upsampled['default'], df_downsampled['default'], y_SMOTE]
name = ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
# BUG FIXES: the original declared `global best_xgb` (a copy-paste leftover
# from another cell) and re-created best_knn inside the loop, so it only
# ever held the last search result. Initialize the list once, before the loop.
best_knn = []
for i, j, k in zip(xtrain_data, ytrain_data, name):
    # KNN is distance-based, so standardize the features first.
    ss = StandardScaler()
    xts = pd.DataFrame(ss.fit_transform(i), columns=i.columns)
    print('Data is ', k)
    # Instantiate a KNN classifier
    knn = KNeighborsClassifier()
    # Randomized search over neighbourhood size and Minkowski power p
    params = {'n_neighbors': sp_randint(1, 20),
              'p': sp_randint(1, 5)}
    rsearch_knn = RandomizedSearchCV(knn, param_distributions=params, cv=3,
                                     random_state=3, n_iter=50, n_jobs=-1)
    rsearch_knn.fit(xts, j)
    best_knn.append(rsearch_knn.best_params_)
    print("Tuned KNN Parameters: {}".format(rsearch_knn.best_params_), "for", k)
    print("_" * 100)
Data is Normal Sampling
Tuned KNN Parameters: {'n_neighbors': 17, 'p': 3} for Normal Sampling
____________________________________________________________________________________________________
Data is Over Sampling
Tuned KNN Parameters: {'n_neighbors': 1, 'p': 4} for Over Sampling
____________________________________________________________________________________________________
Data is Under Sampling
Tuned KNN Parameters: {'n_neighbors': 19, 'p': 4} for Under Sampling
____________________________________________________________________________________________________
Data is SMOTE
Tuned KNN Parameters: {'n_neighbors': 2, 'p': 1} for SMOTE
____________________________________________________________________________________________________
# Best KNN params found by the search above, one dict per sampling strategy.
param_knn = [{'n_neighbors': 17, 'p': 3}, {'n_neighbors': 1, 'p': 4},
             {'n_neighbors': 19, 'p': 4}, {'n_neighbors': 2, 'p': 1}]
xtrain_data = [X_train, df_upsampled.drop('default', axis=1), df_downsampled.drop('default', axis=1), X_SMOTE]
ytrain_data = [y_train, df_upsampled['default'], df_downsampled['default'], y_SMOTE]
name = ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
# Idiom: enumerate replaces the hand-built parallel index list.
for l, (i, j, k) in enumerate(zip(xtrain_data, ytrain_data, name)):
    print('Data is ', k, ' And with hyper parameter ', param_knn[l])
    # Scale train and test with the same fitted scaler, as in the search cell.
    ss = StandardScaler()
    xts = pd.DataFrame(ss.fit_transform(i), columns=i.columns)
    b = ss.transform(X_test)
    model_eval(KNeighborsClassifier(**param_knn[l]), xts, j, b, y_test)
    print("_" * 60)
Data is Normal Sampling And with hyper parameter {'n_neighbors': 17, 'p': 3}
Overall Train Accuracy 0.8221904761904761
Train AUC Score 0.8174452333995557
Overall Test Accuracy 0.8101111111111111
Test AUC Score 0.7381798489837168
Classification Report of Test
precision recall f1-score support
0 0.83 0.95 0.89 7009
1 0.64 0.33 0.43 1991
accuracy 0.81 9000
macro avg 0.74 0.64 0.66 9000
weighted avg 0.79 0.81 0.79 9000
K-Fold scores: 0.636 (+/- 0.00001)
____________________________________________________________
Data is Over Sampling And with hyper parameter {'n_neighbors': 1, 'p': 4}
Overall Train Accuracy 0.9988382757566493
Train AUC Score 0.9988382757566494
Overall Test Accuracy 0.7304444444444445
Test AUC Score 0.6039931869185339
Classification Report of Test
precision recall f1-score support
0 0.82 0.83 0.83 7009
1 0.39 0.38 0.38 1991
accuracy 0.73 9000
macro avg 0.61 0.60 0.60 9000
weighted avg 0.73 0.73 0.73 9000
K-Fold scores: 0.886 (+/- 0.00001)
____________________________________________________________
Data is Under Sampling And with hyper parameter {'n_neighbors': 19, 'p': 4}
Overall Train Accuracy 0.716254036598493
Train AUC Score 0.7912631728967685
Overall Test Accuracy 0.7356666666666667
Test AUC Score 0.7398985619336091
Classification Report of Test
precision recall f1-score support
0 0.86 0.78 0.82 7009
1 0.43 0.57 0.49 1991
accuracy 0.74 9000
macro avg 0.65 0.68 0.65 9000
weighted avg 0.77 0.74 0.75 9000
K-Fold scores: 0.684 (+/- 0.00011)
____________________________________________________________
Data is SMOTE And with hyper parameter {'n_neighbors': 2, 'p': 1}
Overall Train Accuracy 0.9317028431672272
Train AUC Score 0.9868583826014836
Overall Test Accuracy 0.7708888888888888
Test AUC Score 0.6464780985113565
Classification Report of Test
precision recall f1-score support
0 0.81 0.92 0.86 7009
1 0.47 0.24 0.32 1991
accuracy 0.77 9000
macro avg 0.64 0.58 0.59 9000
weighted avg 0.73 0.77 0.74 9000
K-Fold scores: 0.847 (+/- 0.00003)
____________________________________________________________
The best score for KNN is registered when trained with the under sampling data with an Test AUC score of 0.74 and train K-Fold score of 0.68
xtrain_data = [X_train, df_upsampled.drop('default', axis=1), df_downsampled.drop('default', axis=1), X_SMOTE]
ytrain_data = [y_train, df_upsampled['default'], df_downsampled['default'], y_SMOTE]
name = ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
# Collect the best params per sampling strategy (the original discarded them).
best_dt = []
for i, j, k in zip(xtrain_data, ytrain_data, name):
    print('Data is ', k)
    # Instantiate a Decision Tree classifier
    dtc = DecisionTreeClassifier(random_state=42)
    # Parameter distributions to sample from
    params = {'max_depth': sp_randint(2, 20),
              'min_samples_leaf': sp_randint(1, 20),
              'min_samples_split': sp_randint(2, 40),
              'criterion': ['gini', 'entropy']}
    rsearch_dt = RandomizedSearchCV(dtc, param_distributions=params, cv=5,
                                    scoring='roc_auc', n_iter=100, n_jobs=-1)
    rsearch_dt.fit(i, j)
    best_dt.append(rsearch_dt.best_params_)
    # BUG FIX: the original printed the whole `name` list here instead of the
    # current dataset label `k` (visible in the cell output).
    print("Tuned Decision Tree Parameters: {}".format(rsearch_dt.best_params_), "for", k)
    print("_" * 100)
Data is Normal Sampling
Tuned Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 5, 'min_samples_split': 35} for ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
____________________________________________________________________________________________________
Data is Over Sampling
Tuned Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': 18, 'min_samples_leaf': 3, 'min_samples_split': 4} for ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
____________________________________________________________________________________________________
Data is Under Sampling
Tuned Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 14, 'min_samples_split': 8} for ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
____________________________________________________________________________________________________
Data is SMOTE
Tuned Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 17, 'min_samples_split': 7} for ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
____________________________________________________________________________________________________
# Best decision-tree params found by the search above, one per sampling strategy.
param_dt = [{'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 5, 'min_samples_split': 35},
            {'criterion': 'entropy', 'max_depth': 18, 'min_samples_leaf': 3, 'min_samples_split': 4},
            {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 14, 'min_samples_split': 8},
            {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 17, 'min_samples_split': 7}]
xtrain_data = [X_train, df_upsampled.drop('default', axis=1), df_downsampled.drop('default', axis=1), X_SMOTE]
ytrain_data = [y_train, df_upsampled['default'], df_downsampled['default'], y_SMOTE]
name = ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
# Idiom: enumerate replaces the hand-built parallel index list.
for l, (i, j, k) in enumerate(zip(xtrain_data, ytrain_data, name)):
    print('Data is ', k, ' And with hyper parameter ', param_dt[l])
    model_eval(DecisionTreeClassifier(**param_dt[l], random_state=42), i, j, X_test, y_test)
    print("_" * 60)
Data is Normal Sampling And with hyper parameter {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 5, 'min_samples_split': 35}
Overall Train Accuracy 0.827047619047619
Train AUC Score 0.7764473062852303
Overall Test Accuracy 0.8172222222222222
Test AUC Score 0.7436531878114091
Classification Report of Test
precision recall f1-score support
0 0.84 0.95 0.89 7009
1 0.67 0.34 0.45 1991
accuracy 0.82 9000
macro avg 0.75 0.65 0.67 9000
weighted avg 0.80 0.82 0.79 9000
K-Fold scores: 0.659 (+/- 0.00004)
____________________________________________________________
Data is Over Sampling And with hyper parameter {'criterion': 'entropy', 'max_depth': 18, 'min_samples_leaf': 3, 'min_samples_split': 4}
Overall Train Accuracy 0.8948639559767655
Train AUC Score 0.9700655632383037
Overall Test Accuracy 0.7236666666666667
Test AUC Score 0.6522110948834601
Classification Report of Test
precision recall f1-score support
0 0.84 0.79 0.82 7009
1 0.40 0.49 0.44 1991
accuracy 0.72 9000
macro avg 0.62 0.64 0.63 9000
weighted avg 0.75 0.72 0.73 9000
K-Fold scores: 0.803 (+/- 0.00026)
____________________________________________________________
Data is Under Sampling And with hyper parameter {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 14, 'min_samples_split': 8}
Overall Train Accuracy 0.709903121636168
Train AUC Score 0.7635696334241364
Overall Test Accuracy 0.7834444444444445
Test AUC Score 0.7479756063077112
Classification Report of Test
precision recall f1-score support
0 0.87 0.85 0.86 7009
1 0.51 0.54 0.52 1991
accuracy 0.78 9000
macro avg 0.69 0.70 0.69 9000
weighted avg 0.79 0.78 0.79 9000
K-Fold scores: 0.699 (+/- 0.00009)
____________________________________________________________
Data is SMOTE And with hyper parameter {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 17, 'min_samples_split': 7}
Overall Train Accuracy 0.8715683277285234
Train AUC Score 0.9511617513475704
Overall Test Accuracy 0.7644444444444445
Test AUC Score 0.7102599807279426
Classification Report of Test
precision recall f1-score support
0 0.84 0.86 0.85 7009
1 0.46 0.42 0.44 1991
accuracy 0.76 9000
macro avg 0.65 0.64 0.65 9000
weighted avg 0.76 0.76 0.76 9000
K-Fold scores: 0.821 (+/- 0.00004)
____________________________________________________________
xtrain_data = [X_train, df_upsampled.drop('default', axis=1), df_downsampled.drop('default', axis=1), X_SMOTE]
ytrain_data = [y_train, df_upsampled['default'], df_downsampled['default'], y_SMOTE]
name = ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
# BUG FIX: best_rf was re-created inside the loop (and declared `global` at
# module level, a no-op), so it only ever held the last search result.
best_rf = []
for i, j, k in zip(xtrain_data, ytrain_data, name):
    print('Data is ', k)
    # Instantiate a Random Forest classifier
    rfc = RandomForestClassifier(random_state=42)
    # Parameter distributions to sample from
    params = {'n_estimators': sp_randint(50, 200),
              'max_features': sp_randint(1, 24),
              'max_depth': sp_randint(2, 10),
              'min_samples_leaf': sp_randint(1, 20),
              'min_samples_split': sp_randint(2, 20),
              'criterion': ['gini', 'entropy']}
    rsearch_rfc = RandomizedSearchCV(rfc, param_distributions=params, cv=5,
                                     scoring='roc_auc', n_iter=200, random_state=42,
                                     n_jobs=-1, return_train_score=True)
    rsearch_rfc.fit(i, j)
    best_rf.append(rsearch_rfc.best_params_)
    print("Tuned Random Tree Parameters: {}".format(rsearch_rfc.best_params_), "for", k)
    print("_" * 100)
Data is Normal Sampling
Tuned Random Tree Parameters: {'criterion': 'entropy', 'max_depth': 9, 'max_features': 19, 'min_samples_leaf': 7, 'min_samples_split': 7, 'n_estimators': 183} for Normal Sampling
____________________________________________________________________________________________________
Data is Over Sampling
Tuned Random Tree Parameters: {'criterion': 'entropy', 'max_depth': 9, 'max_features': 22, 'min_samples_leaf': 1, 'min_samples_split': 12, 'n_estimators': 162} for Over Sampling
____________________________________________________________________________________________________
Data is Under Sampling
Tuned Random Tree Parameters: {'criterion': 'gini', 'max_depth': 9, 'max_features': 14, 'min_samples_leaf': 16, 'min_samples_split': 15, 'n_estimators': 164} for Under Sampling
____________________________________________________________________________________________________
Data is SMOTE
Tuned Random Tree Parameters: {'criterion': 'entropy', 'max_depth': 9, 'max_features': 15, 'min_samples_leaf': 2, 'min_samples_split': 11, 'n_estimators': 179} for SMOTE
____________________________________________________________________________________________________
# Best random-forest params found by the search above, one per sampling strategy.
param_rf = [{'criterion': 'entropy', 'max_depth': 9, 'max_features': 19, 'min_samples_leaf': 7, 'min_samples_split': 7, 'n_estimators': 183},
            {'criterion': 'entropy', 'max_depth': 9, 'max_features': 22, 'min_samples_leaf': 1, 'min_samples_split': 12, 'n_estimators': 162},
            {'criterion': 'gini', 'max_depth': 9, 'max_features': 14, 'min_samples_leaf': 16, 'min_samples_split': 15, 'n_estimators': 164},
            {'criterion': 'entropy', 'max_depth': 9, 'max_features': 15, 'min_samples_leaf': 2, 'min_samples_split': 11, 'n_estimators': 179}]
xtrain_data = [X_train, df_upsampled.drop('default', axis=1), df_downsampled.drop('default', axis=1), X_SMOTE]
ytrain_data = [y_train, df_upsampled['default'], df_downsampled['default'], y_SMOTE]
name = ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
# Idiom: enumerate replaces the hand-built parallel index list.
for l, (i, j, k) in enumerate(zip(xtrain_data, ytrain_data, name)):
    # BUG FIX: the original printed param_dt[l] here instead of param_rf[l],
    # so the reported hyperparameters did not match the model being evaluated.
    print('Data is ', k, ' And with hyper parameter ', param_rf[l])
    model_eval(RandomForestClassifier(**param_rf[l], random_state=42), i, j, X_test, y_test)
    print("_" * 60)
Data is Normal Sampling And with hyper parameter {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 5, 'min_samples_split': 35}
Overall Train Accuracy 0.842
Train AUC Score 0.8452947667123322
Overall Test Accuracy 0.8171111111111111
Test AUC Score 0.7765993124001651
Classification Report of Test
precision recall f1-score support
0 0.84 0.95 0.89 7009
1 0.67 0.35 0.46 1991
accuracy 0.82 9000
macro avg 0.75 0.65 0.67 9000
weighted avg 0.80 0.82 0.79 9000
K-Fold scores: 0.658 (+/- 0.00003)
____________________________________________________________
Data is Over Sampling And with hyper parameter {'criterion': 'entropy', 'max_depth': 18, 'min_samples_leaf': 3, 'min_samples_split': 4}
Overall Train Accuracy 0.7723937633751147
Train AUC Score 0.8683389571473875
Overall Test Accuracy 0.787
Test AUC Score 0.7712617679830317
Classification Report of Test
precision recall f1-score support
0 0.87 0.86 0.86 7009
1 0.52 0.55 0.53 1991
accuracy 0.79 9000
macro avg 0.69 0.70 0.70 9000
weighted avg 0.79 0.79 0.79 9000
K-Fold scores: 0.751 (+/- 0.00001)
____________________________________________________________
Data is Under Sampling And with hyper parameter {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 14, 'min_samples_split': 8}
Overall Train Accuracy 0.7564047362755651
Train AUC Score 0.8467957605722092
Overall Test Accuracy 0.7644444444444445
Test AUC Score 0.7760706457701403
Classification Report of Test
precision recall f1-score support
0 0.88 0.81 0.84 7009
1 0.47 0.61 0.53 1991
accuracy 0.76 9000
macro avg 0.68 0.71 0.69 9000
weighted avg 0.79 0.76 0.77 9000
K-Fold scores: 0.712 (+/- 0.00011)
____________________________________________________________
Data is SMOTE And with hyper parameter {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 17, 'min_samples_split': 7}
Overall Train Accuracy 0.8615408132069703
Train AUC Score 0.9374637142258181
Overall Test Accuracy 0.8102222222222222
Test AUC Score 0.770381970687182
Classification Report of Test
precision recall f1-score support
0 0.85 0.92 0.88 7009
1 0.60 0.43 0.50 1991
accuracy 0.81 9000
macro avg 0.72 0.68 0.69 9000
weighted avg 0.79 0.81 0.80 9000
K-Fold scores: 0.851 (+/- 0.00001)
____________________________________________________________
xtrain_data = [X_train, df_upsampled.drop('default', axis=1), df_downsampled.drop('default', axis=1), X_SMOTE]
ytrain_data = [y_train, df_upsampled['default'], df_downsampled['default'], y_SMOTE]
name = ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
# BUG FIX: best_ada was re-created inside the loop (and declared `global` at
# module level, a no-op), so it only ever held the last search result.
best_ada = []
for i, j, k in zip(xtrain_data, ytrain_data, name):
    print('Data is ', k)
    # Instantiate an AdaBoost classifier
    ada = AdaBoostClassifier()
    # Grid of hyperparameters: ensemble size, boosting algorithm, learning rate
    param_grid = {'n_estimators': [200, 300],
                  'algorithm': ['SAMME', 'SAMME.R'],
                  'learning_rate': [0.5, 0.75, 1.0]}
    # 5-fold CV grid search
    grid_ada = GridSearchCV(ada, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
    grid_ada.fit(i, j)
    best_ada.append(grid_ada.best_params_)
    print("Tuned Ada Boost Parameters: {}".format(grid_ada.best_params_), "for", k)
    print("_" * 100)
Data is Normal Sampling
Tuned Ada Boost Parameters: {'algorithm': 'SAMME', 'learning_rate': 0.5, 'n_estimators': 300} for Normal Sampling
____________________________________________________________________________________________________
Data is Over Sampling
Tuned Ada Boost Parameters: {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 300} for Over Sampling
____________________________________________________________________________________________________
Data is Under Sampling
Tuned Ada Boost Parameters: {'algorithm': 'SAMME.R', 'learning_rate': 0.5, 'n_estimators': 200} for Under Sampling
____________________________________________________________________________________________________
Data is SMOTE
Tuned Ada Boost Parameters: {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 300} for SMOTE
____________________________________________________________________________________________________
# Evaluate AdaBoost with the tuned hyperparameters (one dict per sampling
# variant, in the same order as xtrain_data/ytrain_data/name).
param_ada = [{'algorithm': 'SAMME', 'learning_rate': 0.5, 'n_estimators': 300},
             {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 300},
             {'algorithm': 'SAMME.R', 'learning_rate': 0.5, 'n_estimators': 200},
             {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 300}]
xtrain_data = [X_train,
               df_upsampled.drop('default', axis=1),
               df_downsampled.drop('default', axis=1),
               X_SMOTE]
ytrain_data = [y_train, df_upsampled['default'], df_downsampled['default'], y_SMOTE]
name = ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']
index = [0, 1, 2, 3]
for i, j, k, l in zip(xtrain_data, ytrain_data, name, index):
    # BUG FIX: originally printed param_dt[l] (the decision-tree params from an
    # earlier cell), mislabeling the hyperparameters actually used here.
    print('Data is ', k, ' And with hyper parameter ', param_ada[l])
    model_eval(AdaBoostClassifier(**param_ada[l]), i, j, X_test, y_test)
    print("_" * 60)
Data is Normal Sampling And with hyper parameter {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 5, 'min_samples_split': 35}
Overall Train Accuracy 0.8211428571428572
Train AUC Score 0.7800456962858852
Overall Test Accuracy 0.8164444444444444
Test AUC Score 0.7677436178597669
Classification Report of Test
precision recall f1-score support
0 0.84 0.95 0.89 7009
1 0.67 0.34 0.45 1991
accuracy 0.82 9000
macro avg 0.75 0.65 0.67 9000
weighted avg 0.80 0.82 0.79 9000
K-Fold scores: 0.655 (+/- 0.00002)
____________________________________________________________
Data is Over Sampling And with hyper parameter {'criterion': 'entropy', 'max_depth': 18, 'min_samples_leaf': 3, 'min_samples_split': 4}
Overall Train Accuracy 0.7278508101498012
Train AUC Score 0.8068288633022979
Overall Test Accuracy 0.7577777777777778
Test AUC Score 0.7648981337691748
Classification Report of Test
precision recall f1-score support
0 0.87 0.80 0.84 7009
1 0.46 0.59 0.52 1991
accuracy 0.76 9000
macro avg 0.67 0.70 0.68 9000
weighted avg 0.78 0.76 0.77 9000
K-Fold scores: 0.718 (+/- 0.00001)
____________________________________________________________
Data is Under Sampling And with hyper parameter {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 14, 'min_samples_split': 8}
Overall Train Accuracy 0.7222820236813778
Train AUC Score 0.7991749870516001
Overall Test Accuracy 0.7591111111111111
Test AUC Score 0.7692095883895851
Classification Report of Test
precision recall f1-score support
0 0.88 0.80 0.84 7009
1 0.47 0.61 0.53 1991
accuracy 0.76 9000
macro avg 0.67 0.70 0.68 9000
weighted avg 0.79 0.76 0.77 9000
K-Fold scores: 0.708 (+/- 0.00010)
____________________________________________________________
Data is SMOTE And with hyper parameter {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 17, 'min_samples_split': 7}
Overall Train Accuracy 0.8627942525221645
Train AUC Score 0.925172296384456
Overall Test Accuracy 0.8136666666666666
Test AUC Score 0.7654376926157722
Classification Report of Test
precision recall f1-score support
0 0.84 0.94 0.89 7009
1 0.63 0.39 0.48 1991
accuracy 0.81 9000
macro avg 0.74 0.66 0.68 9000
weighted avg 0.80 0.81 0.80 9000
K-Fold scores: 0.859 (+/- 0.00001)
____________________________________________________________
# Tune GradientBoosting on each of the four training variants with a 5-fold,
# accuracy-scored grid search, collecting the best params in `best_gbc`.
xtrain_data = [X_train,
               df_upsampled.drop('default', axis=1),
               df_downsampled.drop('default', axis=1),
               X_SMOTE]
ytrain_data = [y_train, df_upsampled['default'], df_downsampled['default'], y_SMOTE]
name = ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']

# BUG FIX: `best_gbc = []` (plus a no-op `global`) used to be re-executed
# inside the loop, so only the last sampling's params survived. Initialize
# the accumulator once, before the loop.
best_gbc = []
for i, j, k in zip(xtrain_data, ytrain_data, name):
    print('Data is ', k)
    # Instantiate a Gradient Boost classifier
    gbc = GradientBoostingClassifier()
    # Creating a grid of hyperparameters
    param_grid = {'n_estimators': [200, 300],
                  'learning_rate': [0.5, 0.75, 1.0]}
    # Building a 5 fold CV GridSearchCV object (all cores)
    grid_gbc = GridSearchCV(gbc, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
    # Fitting the grid to the training data
    grid_gbc.fit(i, j)
    best_gbc.append(grid_gbc.best_params_)
    # BUG FIX: message previously said "Tuned Random Tree Parameters" —
    # a copy-paste error; this search tunes Gradient Boosting.
    print("Tuned Gradient Boosting Parameters: {}".format(grid_gbc.best_params_), "for", k)
    print("_" * 100)
Data is Normal Sampling
Tuned Random Tree Parameters: {'learning_rate': 0.5, 'n_estimators': 200} for Normal Sampling
____________________________________________________________________________________________________
Data is Over Sampling
Tuned Random Tree Parameters: {'learning_rate': 1.0, 'n_estimators': 300} for Over Sampling
____________________________________________________________________________________________________
Data is Under Sampling
Tuned Random Tree Parameters: {'learning_rate': 0.5, 'n_estimators': 200} for Under Sampling
____________________________________________________________________________________________________
Data is SMOTE
Tuned Random Tree Parameters: {'learning_rate': 0.5, 'n_estimators': 200} for SMOTE
____________________________________________________________________________________________________
# Evaluate GradientBoosting with the tuned hyperparameters (one dict per
# sampling variant, same order as xtrain_data/ytrain_data/name/index).
param_gbc = [{'learning_rate': 0.5, 'n_estimators': 200},
             {'learning_rate': 1.0, 'n_estimators': 300},
             {'learning_rate': 0.5, 'n_estimators': 200},
             {'learning_rate': 0.5, 'n_estimators': 200}]
for i, j, k, l in zip(xtrain_data, ytrain_data, name, index):
    # BUG FIX: originally printed param_dt[l] (decision-tree params),
    # mislabeling the hyperparameters actually used for Gradient Boosting.
    print('Data is ', k, ' And with hyper parameter ', param_gbc[l])
    model_eval(GradientBoostingClassifier(**param_gbc[l]), i, j, X_test, y_test)
    print("_" * 60)
Data is Normal Sampling And with hyper parameter {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 5, 'min_samples_split': 35}
Overall Train Accuracy 0.8684761904761905
Train AUC Score 0.8970215538645874
Overall Test Accuracy 0.8048888888888889
Test AUC Score 0.7598651772898144
Classification Report of Test
precision recall f1-score support
0 0.84 0.93 0.88 7009
1 0.59 0.37 0.46 1991
accuracy 0.80 9000
macro avg 0.72 0.65 0.67 9000
weighted avg 0.78 0.80 0.79 9000
K-Fold scores: 0.653 (+/- 0.00002)
____________________________________________________________
Data is Over Sampling And with hyper parameter {'criterion': 'entropy', 'max_depth': 18, 'min_samples_leaf': 3, 'min_samples_split': 4}
Overall Train Accuracy 0.9003057169061449
Train AUC Score 0.9618269384353818
Overall Test Accuracy 0.7383333333333333
Test AUC Score 0.7180844976599292
Classification Report of Test
precision recall f1-score support
0 0.86 0.80 0.83 7009
1 0.43 0.53 0.47 1991
accuracy 0.74 9000
macro avg 0.64 0.67 0.65 9000
weighted avg 0.76 0.74 0.75 9000
K-Fold scores: 0.830 (+/- 0.00005)
____________________________________________________________
Data is Under Sampling And with hyper parameter {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 14, 'min_samples_split': 8}
Overall Train Accuracy 0.8558665231431647
Train AUC Score 0.9372954471456165
Overall Test Accuracy 0.6966666666666667
Test AUC Score 0.7431303255862681
Classification Report of Test
precision recall f1-score support
0 0.87 0.71 0.79 7009
1 0.39 0.64 0.48 1991
accuracy 0.70 9000
macro avg 0.63 0.68 0.63 9000
weighted avg 0.77 0.70 0.72 9000
K-Fold scores: 0.684 (+/- 0.00017)
____________________________________________________________
Data is SMOTE And with hyper parameter {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 17, 'min_samples_split': 7}
Overall Train Accuracy 0.8949250993579945
Train AUC Score 0.9599359461863475
Overall Test Accuracy 0.8045555555555556
Test AUC Score 0.7584600813519591
Classification Report of Test
precision recall f1-score support
0 0.84 0.92 0.88 7009
1 0.59 0.39 0.47 1991
accuracy 0.80 9000
macro avg 0.71 0.66 0.67 9000
weighted avg 0.79 0.80 0.79 9000
K-Fold scores: 0.857 (+/- 0.00002)
____________________________________________________________
# Tune XGBoost on each of the four training variants with a 5-fold,
# ROC-AUC-scored randomized search (200 candidate draws), collecting the
# best hyperparameters in `best_xgb`.
xtrain_data = [X_train,
               df_upsampled.drop('default', axis=1),
               df_downsampled.drop('default', axis=1),
               X_SMOTE]
ytrain_data = [y_train, df_upsampled['default'], df_downsampled['default'], y_SMOTE]
name = ['Normal Sampling', 'Over Sampling', 'Under Sampling', 'SMOTE']

# BUG FIX: `best_xgb = []` (plus a no-op `global`) used to be re-executed
# inside the loop, discarding all but the last sampling's params.
# Initialize the accumulator once, before the loop.
best_xgb = []
for i, j, k in zip(xtrain_data, ytrain_data, name):
    print('Data is ', k)
    # Instantiate an XGBoost classifier
    xgb = XGBClassifier()
    # Hyperparameter distributions to sample from
    param_grid = {"learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
                  "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
                  "min_child_weight": [1, 3, 5, 7],
                  "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
                  "colsample_bytree": [0.3, 0.4, 0.5, 0.7]}
    # 5-fold CV randomized search, scored by ROC AUC, using all cores
    xgb_RS = RandomizedSearchCV(xgb, param_grid, cv=5, scoring='roc_auc',
                                n_iter=200, n_jobs=-1)
    # Fitting the search to the training data
    xgb_RS.fit(i, j)
    best_xgb.append(xgb_RS.best_params_)
    print("Tuned XG Boost Parameters: {}".format(xgb_RS.best_params_), "for", k)
    print("_" * 100)
Data is Normal Sampling
[10:52:44] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Tuned XG Boost Parameters: {'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0.3, 'colsample_bytree': 0.4} for Normal Sampling
____________________________________________________________________________________________________
Data is Over Sampling
[11:12:36] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Tuned XG Boost Parameters: {'min_child_weight': 1, 'max_depth': 15, 'learning_rate': 0.3, 'gamma': 0.2, 'colsample_bytree': 0.4} for Over Sampling
____________________________________________________________________________________________________
Data is Under Sampling
[11:19:12] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Tuned XG Boost Parameters: {'min_child_weight': 7, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.4} for Under Sampling
____________________________________________________________________________________________________
Data is SMOTE
[11:40:15] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Tuned XG Boost Parameters: {'min_child_weight': 1, 'max_depth': 15, 'learning_rate': 0.2, 'gamma': 0.4, 'colsample_bytree': 0.7} for SMOTE
____________________________________________________________________________________________________
# Evaluate XGBoost with the tuned hyperparameters (one dict per sampling
# variant, same order as xtrain_data/ytrain_data/name).
param_xgb = [{'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.05,
              'gamma': 0.3, 'colsample_bytree': 0.4},
             {'min_child_weight': 1, 'max_depth': 15, 'learning_rate': 0.3,
              'gamma': 0.2, 'colsample_bytree': 0.4},
             {'min_child_weight': 7, 'max_depth': 4, 'learning_rate': 0.05,
              'gamma': 0.2, 'colsample_bytree': 0.4},
             {'min_child_weight': 1, 'max_depth': 15, 'learning_rate': 0.2,
              'gamma': 0.4, 'colsample_bytree': 0.7}]
index = [0, 1, 2, 3]
for i, j, k, l in zip(xtrain_data, ytrain_data, name, index):
    # BUG FIX: originally printed param_dt[l] (decision-tree params),
    # mislabeling the hyperparameters actually used for XGBoost.
    print('Data is ', k, ' And with hyper parameter ', param_xgb[l])
    model_eval(XGBClassifier(**param_xgb[l]), i, j, X_test, y_test)
    print("_" * 60)
Data is Normal Sampling And with hyper parameter {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 5, 'min_samples_split': 35}
[12:20:40] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Overall Train Accuracy 0.8325238095238096
Train AUC Score 0.8175531195464991
Overall Test Accuracy 0.8157777777777778
Test AUC Score 0.7810854007823335
Classification Report of Test
precision recall f1-score support
0 0.84 0.95 0.89 7009
1 0.66 0.34 0.45 1991
accuracy 0.82 9000
macro avg 0.75 0.65 0.67 9000
weighted avg 0.80 0.82 0.79 9000
[12:20:41] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:20:42] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:20:42] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:20:43] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:20:44] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
K-Fold scores: 0.654 (+/- 0.00002)
____________________________________________________________
Data is Over Sampling And with hyper parameter {'criterion': 'entropy', 'max_depth': 18, 'min_samples_leaf': 3, 'min_samples_split': 4}
[12:20:45] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Overall Train Accuracy 0.9976765515132987
Train AUC Score 0.9999766997173031
Overall Test Accuracy 0.7965555555555556
Test AUC Score 0.7449044670198373
Classification Report of Test
precision recall f1-score support
0 0.84 0.91 0.87 7009
1 0.55 0.41 0.47 1991
accuracy 0.80 9000
macro avg 0.70 0.66 0.67 9000
weighted avg 0.78 0.80 0.78 9000
[12:20:48] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:20:50] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:20:53] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:20:55] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:20:58] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
K-Fold scores: 0.930 (+/- 0.00001)
____________________________________________________________
Data is Under Sampling And with hyper parameter {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 14, 'min_samples_split': 8}
[12:21:02] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Overall Train Accuracy 0.733799784714747
Train AUC Score 0.8114684238639879
Overall Test Accuracy 0.7595555555555555
Test AUC Score 0.779722942139614
Classification Report of Test
precision recall f1-score support
0 0.88 0.80 0.84 7009
1 0.47 0.62 0.53 1991
accuracy 0.76 9000
macro avg 0.67 0.71 0.69 9000
weighted avg 0.79 0.76 0.77 9000
[12:21:02] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:21:02] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:21:03] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:21:03] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:21:03] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
K-Fold scores: 0.715 (+/- 0.00007)
____________________________________________________________
Data is SMOTE And with hyper parameter {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 17, 'min_samples_split': 7}
[12:21:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Overall Train Accuracy 0.998135126872516
Train AUC Score 0.9999872068082809
Overall Test Accuracy 0.8021111111111111
Test AUC Score 0.7483989337379887
Classification Report of Test
precision recall f1-score support
0 0.84 0.92 0.88 7009
1 0.58 0.39 0.46 1991
accuracy 0.80 9000
macro avg 0.71 0.65 0.67 9000
weighted avg 0.78 0.80 0.79 9000
[12:21:09] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:21:13] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:21:16] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:21:20] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[12:21:24] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
K-Fold scores: 0.878 (+/- 0.00002)
____________________________________________________________
!pip install nbconvert
Requirement already satisfied: nbconvert in c:\users\sanke\anaconda3\lib\site-packages (6.0.7) Requirement already satisfied: mistune<2,>=0.8.1 in c:\users\sanke\anaconda3\lib\site-packages (from nbconvert) (0.8.4) Requirement already satisfied: traitlets>=4.2 in c:\users\sanke\anaconda3\lib\site-packages (from nbconvert) (5.0.5) Requirement already satisfied: nbformat>=4.4 in c:\users\sanke\anaconda3\lib\site-packages (from nbconvert) (5.0.8) Requirement already satisfied: entrypoints>=0.2.2 in c:\users\sanke\anaconda3\lib\site-packages (from nbconvert) (0.3) Requirement already satisfied: testpath in c:\users\sanke\anaconda3\lib\site-packages (from nbconvert) (0.4.4) Requirement already satisfied: defusedxml in c:\users\sanke\anaconda3\lib\site-packages (from nbconvert) (0.6.0) Requirement already satisfied: jinja2>=2.4 in c:\users\sanke\anaconda3\lib\site-packages (from nbconvert) (2.11.2) Requirement already satisfied: pygments>=2.4.1 in c:\users\sanke\anaconda3\lib\site-packages (from nbconvert) (2.7.2) Requirement already satisfied: bleach in c:\users\sanke\anaconda3\lib\site-packages (from nbconvert) (3.2.1) Requirement already satisfied: nbclient<0.6.0,>=0.5.0 in c:\users\sanke\anaconda3\lib\site-packages (from nbconvert) (0.5.1) Requirement already satisfied: jupyter-core in c:\users\sanke\anaconda3\lib\site-packages (from nbconvert) (4.6.3) Requirement already satisfied: jupyterlab-pygments in c:\users\sanke\anaconda3\lib\site-packages (from nbconvert) (0.1.2) Requirement already satisfied: pandocfilters>=1.4.1 in c:\users\sanke\anaconda3\lib\site-packages (from nbconvert) (1.4.3) Requirement already satisfied: ipython-genutils in c:\users\sanke\anaconda3\lib\site-packages (from traitlets>=4.2->nbconvert) (0.2.0) Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in c:\users\sanke\anaconda3\lib\site-packages (from nbformat>=4.4->nbconvert) (3.2.0) Requirement already satisfied: MarkupSafe>=0.23 in c:\users\sanke\anaconda3\lib\site-packages (from 
jinja2>=2.4->nbconvert) (1.1.1) Requirement already satisfied: webencodings in c:\users\sanke\anaconda3\lib\site-packages (from bleach->nbconvert) (0.5.1) Requirement already satisfied: six>=1.9.0 in c:\users\sanke\anaconda3\lib\site-packages (from bleach->nbconvert) (1.15.0) Requirement already satisfied: packaging in c:\users\sanke\anaconda3\lib\site-packages (from bleach->nbconvert) (20.4) Requirement already satisfied: async-generator in c:\users\sanke\anaconda3\lib\site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert) (1.10) Requirement already satisfied: jupyter-client>=6.1.5 in c:\users\sanke\anaconda3\lib\site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert) (6.1.7) Requirement already satisfied: nest-asyncio in c:\users\sanke\anaconda3\lib\site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert) (1.4.2) Requirement already satisfied: pywin32>=1.0; sys_platform == "win32" in c:\users\sanke\anaconda3\lib\site-packages (from jupyter-core->nbconvert) (227) Requirement already satisfied: setuptools in c:\users\sanke\anaconda3\lib\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (50.3.1.post20201107) Requirement already satisfied: attrs>=17.4.0 in c:\users\sanke\anaconda3\lib\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (20.3.0) Requirement already satisfied: pyrsistent>=0.14.0 in c:\users\sanke\anaconda3\lib\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert) (0.17.3) Requirement already satisfied: pyparsing>=2.0.2 in c:\users\sanke\anaconda3\lib\site-packages (from packaging->bleach->nbconvert) (2.4.7) Requirement already satisfied: pyzmq>=13 in c:\users\sanke\anaconda3\lib\site-packages (from jupyter-client>=6.1.5->nbclient<0.6.0,>=0.5.0->nbconvert) (19.0.2) Requirement already satisfied: python-dateutil>=2.1 in c:\users\sanke\anaconda3\lib\site-packages (from jupyter-client>=6.1.5->nbclient<0.6.0,>=0.5.0->nbconvert) (2.8.1) Requirement already satisfied: tornado>=4.1 in 
c:\users\sanke\anaconda3\lib\site-packages (from jupyter-client>=6.1.5->nbclient<0.6.0,>=0.5.0->nbconvert) (6.0.4)
!pip install XeLaTeX
ERROR: Could not find a version that satisfies the requirement XeLaTeX (from versions: none) ERROR: No matching distribution found for XeLaTeX
!pip install texlive-xetex texlive-fonts-recommended texlive-latex-recommended
ERROR: Could not find a version that satisfies the requirement texlive-xetex (from versions: none) ERROR: No matching distribution found for texlive-xetex